Apache License 2.0
library(raster)
We start with two source lines and right after them the code from those two scripts.
source("src/RMSE.r")
Calculate the Root Mean Squared Error.
RMSE = function(truth, prediction)
{
return(sqrt(mean((truth-prediction)^2, na.rm=TRUE)))
}
Calculate partial RMSE for different zones by generating squared difference rasters.
GetDifferenceRaster = function(former, latter, filename=paste("data/", names(latter), ".grd", sep=""), force=FALSE)
{
# Figure out a reasonable output name
if (filename == "data/layer.grd")
warning("Please pass filename or assign meaningful names to input rasters! Else the loaded data may not match expectations.")
# Create one or load one
if (!file.exists(filename) || force)
{
return(overlay(former, latter, fun=function(truth, prediction){(truth-prediction)^2},
filename=filename))
}
else
return(raster(filename))
}
Calculate RMSE per zone.
StratifiedRMSE = function(truth, prediction, zones, zonenames = "", ...)
{
# Unfortunately, zonal() just passes a vector of numbers, not a matrix.
# So we have to calclate RMSE manually from a difference raster.
differenceRaster = GetDifferenceRaster(truth, prediction, ...)
# Get a mean from the zones in the difference raster
zonestats = zonal(differenceRaster, zones, fun="mean")
# Add nice labels, if we have them
if (length(zonenames) > 1)
rownames(zonestats) = zonenames
return(zonestats)
}
The second source file starts here.
source("src/LinearModel.r")
Creates a linear model and prints its summary, and if desired whether some covariates ought to be removed from the model, and a plot of residuals.
lmValidation = function(..., printstep=FALSE, printplot=FALSE)
{
LM = lm(...)
print(summary(LM))
if (printstep)
step(LM)
if (printplot)
plot(LM)
return(LM)
}
# Creates a prediction based on the model, clamps values to a given range, and
# if requested plots a histogram and/or a comparison plot between the layers
predictValidation = function(object, ..., truthlayer = "", range = c(-Inf, +Inf), plothist = FALSE, plotcomparison = FALSE)
{
Prediction = predict(object, ...)
Prediction[Prediction < range[1]] = NA
Prediction[Prediction > range[2]] = NA
if (plothist)
hist(Prediction, breaks = 200, main="Histogram of predicted values")
if (plotcomparison)
{
TruthExists = try(class(object[[truthlayer]]))
if (class(TruthExists) == "try-error")
{
warning("Requested a comparison plot, but could not find a layer to compare to!")
return(Prediction)
}
op = par(mfrow=c(1,2))
plot(Prediction, colNA="black", main="Predicted values")
plot(object[[truthlayer]], colNA="black", main="Ground truth")
par(op)
}
return(Prediction)
}
GetFromWURgit = function(filename)
{
download.file(paste("https://github.com/GeoScripting-WUR/AdvancedRasterAnalysis/raw/gh-pages/data/", filename, sep=""),
paste("data/", filename, sep=""), "wget")
return(paste("data/", filename, sep=""))
}
load(GetFromWURgit("GewataB1.rda"))
load(GetFromWURgit("GewataB2.rda"))
load(GetFromWURgit("GewataB3.rda"))
load(GetFromWURgit("GewataB4.rda"))
load(GetFromWURgit("GewataB5.rda"))
load(GetFromWURgit("GewataB7.rda"))
load(GetFromWURgit("vcfGewata.rda"))
load(GetFromWURgit("trainingPoly.rda"))
DataBrick = brick(GewataB1, GewataB2, GewataB3, GewataB4, GewataB5, GewataB7, vcfGewata)
names(DataBrick) = c("Blue", "Green", "Red", "NIR", "SWIR", "Emission", "VCF")
Sanitise data
DataBrick[["VCF"]][DataBrick[["VCF"]] > 100] = NA
DataValues = as.data.frame(getValues(DataBrick))
Create a training raster for RMSE
trainingRaster = rasterize(trainingPoly, DataBrick[["VCF"]], field='Class', filename="data/trainingRaster.grd", overwrite=TRUE)
## Found 16 region(s) and 16 polygon(s)
Here we produce plots that demonstrate the relationship between the Landsat bands and the VCF tree cover.
pairs(DataBrick)
From this we can conclude that they are all negatively correlated with VCF, except for NIR. The bands are also highly correlated with each other.
Here we create a lm() model and show a summary with our own lmValidation()
LM = lmValidation(VCF ~ Blue + Green + Red + NIR + SWIR + Emission, data=DataValues, printstep = TRUE)
##
## Call:
## lm(formula = ..1, data = ..2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.205 -4.636 0.715 5.211 258.436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.549e+01 5.723e-02 1493.805 <2e-16 ***
## Blue 9.291e-02 1.889e-04 491.817 <2e-16 ***
## Green -1.505e-01 2.353e-04 -639.622 <2e-16 ***
## Red -2.528e-03 1.698e-04 -14.889 <2e-16 ***
## NIR 1.661e-02 2.790e-05 595.378 <2e-16 ***
## SWIR -2.006e-02 7.036e-05 -285.119 <2e-16 ***
## Emission 2.549e-05 8.969e-05 0.284 0.776
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.606 on 1808277 degrees of freedom
## (13712 observations deleted due to missingness)
## Multiple R-squared: 0.8582, Adjusted R-squared: 0.8582
## F-statistic: 1.824e+06 on 6 and 1808277 DF, p-value: < 2.2e-16
##
## Start: AIC=7784664
## VCF ~ Blue + Green + Red + NIR + SWIR + Emission
##
## Df Sum of Sq RSS AIC
## - Emission 1 6 133937265 7784662
## <none> 133937259 7784664
## - Red 1 16420 133953679 7784884
## - SWIR 1 6021286 139958546 7864181
## - Blue 1 17916072 151853332 8011681
## - NIR 1 26255617 160192877 8108358
## - Green 1 30302856 164240115 8153476
##
## Step: AIC=7784662
## VCF ~ Blue + Green + Red + NIR + SWIR
##
## Df Sum of Sq RSS AIC
## <none> 133937265 7784662
## - Red 1 21399 133958664 7784949
## - SWIR 1 17210900 151148165 8003262
## - Blue 1 18173224 152110489 8014738
## - Green 1 31342888 165280153 8164889
## - NIR 1 34235262 168172528 8196260
Emission can be dropped
LM = lmValidation(VCF ~ Blue + Green + Red + NIR + SWIR, data=DataValues, printstep = TRUE)
##
## Call:
## lm(formula = ..1, data = ..2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.204 -4.636 0.715 5.211 258.586
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.550e+01 5.601e-02 1526.5 <2e-16 ***
## Blue 9.291e-02 1.876e-04 495.3 <2e-16 ***
## Green -1.505e-01 2.314e-04 -650.5 <2e-16 ***
## Red -2.504e-03 1.473e-04 -17.0 <2e-16 ***
## NIR 1.661e-02 2.443e-05 679.9 <2e-16 ***
## SWIR -2.004e-02 4.158e-05 -482.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.606 on 1808278 degrees of freedom
## (13712 observations deleted due to missingness)
## Multiple R-squared: 0.8582, Adjusted R-squared: 0.8582
## F-statistic: 2.189e+06 on 5 and 1808278 DF, p-value: < 2.2e-16
##
## Start: AIC=7784662
## VCF ~ Blue + Green + Red + NIR + SWIR
##
## Df Sum of Sq RSS AIC
## <none> 133937265 7784662
## - Red 1 21399 133958664 7784949
## - SWIR 1 17210900 151148165 8003262
## - Blue 1 18173224 152110489 8014738
## - Green 1 31342888 165280153 8164889
## - NIR 1 34235262 168172528 8196260
Nothing further can be dropped, the most significant bands are NIR and green.
NB: a linear model isn’t very appropriate, because normality and independence assumptions are violated!
BrickSubset = dropLayer(DataBrick, "Emission")
Here we predict the values and plot a histogram using the predictValidation().
Prediction = predictValidation(BrickSubset, model=LM, na.rm=TRUE, plothist=TRUE)
In the histogram we see that some values are out of range, we apply range constraints to the prediction analysis.
The results are plotted below.
Prediction = predictValidation(BrickSubset, model=LM, na.rm=TRUE, overwrite=TRUE, filename="data/PredictionRaster.grd",
truthlayer="VCF", range=c(0, +Inf), plotcomparison=TRUE)
names(Prediction) = "Predicted.LM"
Compute the RMSE:
RMSE(getValues(DataBrick[["VCF"]]), getValues(Prediction))
## [1] 8.35677
RMSE per zone:
zonestats = StratifiedRMSE(DataBrick[["VCF"]], Prediction, trainingRaster, zonenames=levels(trainingPoly@data$Class))
zonestats
## zone mean
## cropland 1 78.55089
## forest 2 24.87072
## wetland 3 141.92961
As a bonus, plot the difference raster:
plot(raster("data/Predicted.LM.grd"))
Here we repeat the analysis and only use independent variables:
ReducedBrick = dropLayer(DataBrick, "Emission")
ReducedBrick = dropLayer(ReducedBrick, "Green")
ReducedBrick = dropLayer(ReducedBrick, "Red")
ReducedBrick = dropLayer(ReducedBrick, "SWIR")
pairs(ReducedBrick)
ReducedLM = lmValidation(VCF ~ Blue + NIR, data=DataValues, printstep=TRUE)
##
## Call:
## lm(formula = ..1, data = ..2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -76.341 -9.468 1.198 10.277 172.619
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.122e+02 9.092e-02 1234 <2e-16 ***
## Blue -2.005e-01 1.290e-04 -1555 <2e-16 ***
## NIR 8.266e-03 3.017e-05 274 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.73 on 1808281 degrees of freedom
## (13712 observations deleted due to missingness)
## Multiple R-squared: 0.5847, Adjusted R-squared: 0.5847
## F-statistic: 1.273e+06 on 2 and 1808281 DF, p-value: < 2.2e-16
##
## Start: AIC=9728271
## VCF ~ Blue + NIR
##
## Df Sum of Sq RSS AIC
## <none> 392372126 9728271
## - NIR 1 16289145 408661271 9801822
## - Blue 1 524603632 916975758 11263267
RedPrediction = predictValidation(ReducedBrick, model=ReducedLM, na.rm=TRUE,
filename="data/ReducedPredictionRaster.grd", overwrite=TRUE, range=c(0, +Inf),
truthlayer="VCF", plothist=TRUE, plotcomparison=TRUE)
names(RedPrediction) = "Predicted.LM.Blue.NIR"
RMSE(getValues(ReducedBrick[["VCF"]]), getValues(RedPrediction))
## [1] 14.69918
zonestatsR = StratifiedRMSE(DataBrick[["VCF"]], RedPrediction, trainingRaster,
zonenames=levels(trainingPoly@data$Class))
zonestatsR
## zone mean
## cropland 1 156.7424
## forest 2 63.1550
## wetland 3 154.6216
plot(raster("data/Predicted.LM.Blue.NIR.grd"))